Import¶

In [1]:
import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import classification_report

from scipy.stats import mode

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()

# Local imports
from ipynb.fs.defs.task3_1 import DatasetManager
from ipynb.fs.defs.task3_2 import ModelManager, plot_bar_data, plot_collection

Code¶

ModelManager Class¶

In [2]:
# ModelManager class is modified to accomodate new clustering models
class ModelManager3(ModelManager):
    def __init__(self, feature_set, targets):
        super().__init__(feature_set, targets)
    
    def train_model_reg(self):
        """
        Trains a KMeans clustering model for
        regression tasks.
        """
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        
        X_test = self._train_and_test_sets.get("X_test")
        y_test = self._train_and_test_sets.get("y_test")
        
        # Fit the KMeans model on the training data
        print("Fitting model...")
        km = MiniBatchKMeans(n_clusters=50, batch_size=50)
        km.fit(X_train)
        print("Model fitting complete...")

        # Assign instances in training set to closest cluster and get mean target value of each cluster
        print("Making predictions...")
        train_labels = km.predict(X_train)
        cluster_means = [y_train[train_labels == i].mean() for i in range(km.n_clusters)]
        train_preds = np.array([cluster_means[i] for i in train_labels])

        # Predict clusters of the test set and assign mean target value of corresponding cluster to each instance
        test_labels = km.predict(X_test)
        test_preds = np.array([cluster_means[i] for i in test_labels])
        
        self._train_preds = train_preds
        self._test_preds = test_preds
        self._trained_model = km
    
    def train_model_clf(self, n_classes):
        """
        Trains a KMeans clustering model for 
        classification tasks.
        """
        assert self._train_and_test_sets != None, "You don't have your training and test sets."
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        
        X_test = self._train_and_test_sets.get("X_test")
        y_test = self._train_and_test_sets.get("y_test")
        
        # Fit the KMeans model on the training data
        print("Fitting model...")
        km = MiniBatchKMeans(n_clusters=n_classes, batch_size=n_classes)
        km.fit(X_train)
        print("Model fitting complete...")

        # Predict clusters on the training set
        print("Making predictions...")
        train_preds = km.predict(X_train)

        # Predict clusters on the test set
        test_preds = km.predict(X_test)

        # Assign class variables
        self._trained_model = km
        self._train_preds = train_preds
        self._test_preds = test_preds
        
    def visualise_results_clf(self):
        """
        Creates a series of plots to visualise performance
        results for a classification model.
        """
        assert self._trained_model != None, "You haven't trained a model yet."
        # Getting training, test and predictions data
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds

        # Get key metric plot
        key_metric_plot = self._get_key_metric_plot_clf(y_train, y_test, train_preds, test_preds)

        # True values vs predictions
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)

        # Combining plots
        plots = {
            (1,1,"Dataset",""): key_metric_plot,
            (2,1,"","Class"): true_pred_plot,
        }

        subplot_titles = [
            "Key Metrics", 
            "True vs Predicted Values",
        ]

        specs = [
            [{"type": "bar", "colspan": 2}, None],
            [{"type": "xy", "colspan": 2}, None],
        ]

        combined_plot = plot_collection(
            plots, 
            rows=2, 
            cols=2, 
            subplot_titles=subplot_titles, 
            specs=specs, 
            title="Model Performance Results", 
        )

        return combined_plot

Loading Datasets¶

In [3]:
# Productivity dataset; using optimal configuration as determined in Task3-1
gwp_dsm = DatasetManager("gwp_assessment")
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative")
gwp_dsm.create_feature_set(7)
gwp_dsm.scale_feature_set()

# Star dataset; using optimal configuration as determined in Task3-1
star_dsm = DatasetManager("star_assessment")
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn")
star_dsm.create_feature_set(8)
star_dsm.scale_feature_set()
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...

Getting Targets and Features¶

In [4]:
# Productivity dataset
gwp_features = gwp_dsm.get_scaled_feat_ds()
gwp_targets = gwp_dsm.get_complete_ds()[:, -1]

# Star dataset
star_features = star_dsm.get_scaled_feat_ds()
star_targets = star_dsm.get_complete_ds()[:, -1]

Initialising Model Managers¶

In [5]:
# GWP dataset
gwp_mm = ModelManager3(gwp_features, gwp_targets)

# Star dataset
star_mm = ModelManager3(star_features, star_targets)

Model Evaluation¶

Methodology

  1. Datasets will be split into training and test sets.
  2. Models will be trained on training sets using mini-bached KMeans clustering.
  3. Model performance will be evaluated using selected evaluation metrics; the results will then be visualised to paint full picture of a model's performance.
  4. Steps 1-3 will be repeated for several training-test splits (80-20, 75-25, 70-30, 60-40, 50-50) to assess the effect of split ratio on model performance.

Evaluation metrics

  • Productivity dataset: accuracy, precision, recall, F1 score. These metrics are ideal metrics for evaluating classification models as they provide comprehensive insight into a model's performance. Accuracy helps understand the overall effectiveness of the model. However, it can be misleading in imbalanced datasets, which is where precision and recall come in. They provide a more nuanced view of the model's ability to correctly identify positive instances and avoid false positives. The F1 score harmonises precision and recall, offering a single metric that seeks a balance between these two characteristics, making it especially useful when the costs of false positives and false negatives are significantly different.

  • Star dataset: mean squared error (MSE), mean abolute error (MAE), R2 score. These are robust metrics for evaluating regression models, with each illuminating different aspects of model performance. MSE emphasizes larger errors by squaring residuals, making it useful when larger errors are undesirable. MAE provides a more straightforward measure of average error magnitude, regardless of direction. The R2 score complements these by providing a relative measure of how much variance the model can explain, giving a broader picture of model performance beyond just raw error. These combined provide a comprehensive assessment of the model's effectiveness.

Notes

  • Due to the size of the star dataset (as well as the limitations of the machine on which this program was developed) only small subset of the dataset (approximately 2%) will be used to train models.

80-20 Split¶

Splitting Datasets into Train and Test Sets¶

In [6]:
# Splitting productivity dataset
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)

# Splitting star dataset
star_mm.split_dataset(train_size=0.016, test_size=0.004)

Model Training¶

In [7]:
# Productivity dataset
gwp_mm.train_model_reg()
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [8]:
# Star dataset
star_mm.train_model_clf(3)
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Results¶

In [9]:
# Star dataset
gwp_mm.visualise_results_reg()
In [10]:
# Star dataset
star_mm.visualise_results_clf()

Analysis¶

Productivity dataset

  • The model acheives a reasonably low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent (allbeit low) across both the training and test sets, with the model having near-identical performance (across all metrics) on both sets.
  • The scores indicate that the model has not overfitted the data; but, given how low the scores are, it is possible that the model may have underfitted.

75-25 Split¶

Splitting Datasets into Train and Test Sets¶

In [11]:
# Splitting productivity dataset
gwp_mm.split_dataset(train_size=0.75, test_size=0.25)

# Splitting star dataset
star_mm.split_dataset(train_size=0.015, test_size=0.005)

Model Training¶

In [12]:
# Productivity dataset
gwp_mm.train_model_reg()
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [13]:
# Star dataset
star_mm.train_model_clf(3)
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Results¶

In [14]:
# Star dataset
gwp_mm.visualise_results_reg()
In [15]:
# Star dataset
star_mm.visualise_results_clf()

Analysis¶

Productivity dataset

  • The model acheives reasonbale low scores across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring only slighter better on the testing set across all metrics.
  • The scores indicate that the model has not overfitted the data; but, given how low the scores are, it is possible that the model may have underfitted.

70-30 Split¶

Splitting Datasets into Train and Test Sets¶

In [16]:
# Splitting productivity dataset
gwp_mm.split_dataset(train_size=0.7, test_size=0.3)

# Splitting star dataset
star_mm.split_dataset(train_size=0.014, test_size=0.006)

Model Training¶

In [17]:
# Productivity dataset
gwp_mm.train_model_reg()
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [18]:
# Star dataset
star_mm.train_model_clf(3)
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Results¶

In [19]:
# Star dataset
gwp_mm.visualise_results_reg()
In [20]:
# Star dataset
star_mm.visualise_results_clf()

Analysis¶

Productivity dataset

  • The model acheives reasonably low scores across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model having almost identical performance on both sets.
  • The scores indicate that the model has not overfitted the data; but, given how low the scores are, it is possible that the model may have underfitted.

60-40 Split¶

Splitting Datasets into Train and Test Sets¶

In [21]:
# Splitting productivity dataset
gwp_mm.split_dataset(train_size=0.6, test_size=0.4)

# Splitting star dataset
star_mm.split_dataset(train_size=0.012, test_size=0.008)

Model Training¶

In [22]:
# Productivity dataset
gwp_mm.train_model_reg()
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [23]:
# Star dataset
star_mm.train_model_clf(3)
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Results¶

In [24]:
# Star dataset
gwp_mm.visualise_results_reg()
In [25]:
# Star dataset
star_mm.visualise_results_clf()

Analysis¶

Productivity dataset

  • The model acheives reasonably low scores across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring only slighter better on the training set across all metrics.
  • The scores indicate that the model has not overfitted the data; but, given how low the scores are, it is possible that the model may have underfitted.

50-50 Split¶

Splitting Datasets into Train and Test Sets¶

In [26]:
# Splitting productivity dataset
gwp_mm.split_dataset(train_size=0.5, test_size=0.5)

# Splitting star dataset
star_mm.split_dataset(train_size=0.01, test_size=0.01)

Model Training¶

In [27]:
# Productivity dataset
gwp_mm.train_model_reg()
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

In [28]:
# Star dataset
star_mm.train_model_clf(3)
Fitting model...
Model fitting complete...
Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

Results¶

In [29]:
# Star dataset
gwp_mm.visualise_results_reg()
In [30]:
# Star dataset
star_mm.visualise_results_clf()

Analysis¶

Productivity dataset

  • The model acheives reasonably low scores across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model having near-identical performance (across all metrics) on both sets.
  • The scores indicate that the model has not overfitted the data; but, given how low the scores are, it is possible that the model may have underfitted.

Analysis of split ratios¶

Productivity dataset

  • The overall accuracy of the model is relatively unaffected by the changing of split ratio. The MSE and MAE scores are fairly consistent across all split ratios with only the slightest uptick as the split ratio goes from 80-20 to 50-50.
  • The R2 scores are very low for all split ratios but they vary arbitrarily. However, the 80-20 split is the only split ration where there isn't a significant gap between the training R2 score and testing R2 score.
  • 80-20 appears to be the optimal split ratio.

Star dataset

  • Performance on both the training set and test set begins to decrease slightly as the split ratio approaches 50-50; this suggests that as the model is fed less and less training data, is starts to underfit.
  • The optimal split ratio appears to be 80-20.

Mardown Answer¶

When it comes to the star dataset, the classification models (both SVM and MLP) have the clear advantage over the clustering models. Across all metrics (accuracy, precision, recall, F1 score), the classification models score roughly 20-30 percentage points higher than the clustering models. This is primarily because the classification models leverage known labels during training, which guides the models to correct their mistakes, hence refining their decision boundaries over time. The clustering models, on the other hand, rely purely on data patterns and distributions to make their groupings; this has the potential to create severe inaccuracies. Additionally, the structured approach of the classification models, especially when dealing with a small number of distinct classes, allows for the clear differentiation and better handling of complex relationships between features, thus yielding better results. Lastly, the classification models benefit from a range of techniques like regularisation, boosting, or bagging to prevent overfitting, and they are able to manage class imbalance better than the clustering models, thereby improving their overall performance.

In [ ]: